New chunk Ctrl+Alt+I
Execute chunk Ctrl+Shift+Enter
Execute all chunks Ctrl+Alt+R
HTML preview Ctrl+Shift+K
library(readr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(reshape2)
library(stats)
# Load the HEI tweet dataset. tweet_id is forced to character so long
# numeric IDs are not mangled by double-precision rounding.
data <- read.csv("~/4year/2semester/dtII/CSVs/HEIs.csv",
colClasses = c(tweet_id = "character"))
# Modifying created_at type so that attribute can be used more easily
# (ISO-8601 timestamps parsed as UTC POSIXct).
data$created_at <- as.POSIXct(data$created_at,
format= "%Y-%m-%dT%H:%M:%S", tz="UTC")
#View(data)
summary(data)
id tweet_id text type bookmark_count favorite_count retweet_count reply_count
Length:11728 Length:11728 Length:11728 Length:11728 Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.000
Class :character Class :character Class :character Class :character 1st Qu.: 0.000 1st Qu.: 7.00 1st Qu.: 2.00 1st Qu.: 0.000
Mode :character Mode :character Mode :character Mode :character Median : 0.000 Median : 20.00 Median : 5.00 Median : 1.000
Mean : 1.543 Mean : 60.67 Mean : 10.62 Mean : 3.888
3rd Qu.: 1.000 3rd Qu.: 57.00 3rd Qu.: 11.00 3rd Qu.: 3.000
Max. :418.000 Max. :41655.00 Max. :4214.00 Max. :2317.000
view_count created_at hashtags urls media_type media_urls
Min. : 5 Min. :2022-08-01 03:05:11.00 Length:11728 Length:11728 Length:11728 Length:11728
1st Qu.: 2643 1st Qu.:2022-10-19 12:56:27.00 Class :character Class :character Class :character Class :character
Median : 6240 Median :2023-01-29 08:26:30.00 Mode :character Mode :character Mode :character Mode :character
Mean : 14182 Mean :2023-01-30 07:39:34.96
3rd Qu.: 16058 3rd Qu.:2023-05-05 14:16:43.25
Max. :7604544 Max. :2023-08-31 20:50:01.00
NA's :4840
# Count of rows (posts of any kind) contributed by each HEI
number_interactions <- data %>%
  group_by(id) %>%
  summarise(count = n())
number_interactions
# complutense.csv contributes a single row, too little to analyse, so drop it
data <- data[data$id != "complutense.csv", ]
# Per-HEI counts: every post, tweets only, replies only
number_posts <- data %>%
  group_by(id) %>%
  summarise(count = n())
number_tweets <- data[data$type == "Tweet", ] %>%
  group_by(id) %>%
  summarise(count = n())
number_replys <- data[data$type == "Reply", ] %>%
  group_by(id) %>%
  summarise(count = n())
print(number_posts)
print(number_tweets)
print(number_replys)
# Outer-join the three count tables; merge suffixes yield count.x (posts),
# count.y (tweets) and count (replies), converted to percentages of all posts
data_ratio <- Reduce(
  function(acc, nxt) merge(acc, nxt, by = "id", all = TRUE),
  list(number_posts, number_tweets, number_replys)
)
data_ratio$percentage_tweets <- (data_ratio$count.y / data_ratio$count.x) * 100
data_ratio$percentage_replies <- (data_ratio$count / data_ratio$count.x) * 100
data_ratio <- data_ratio[, c("id", "percentage_tweets", "percentage_replies")]
print(data_ratio)
# Tweets-only subset reused by the analyses below
data_tweets <- data[data$type == "Tweet", ]
data_tweets
average_tweets <- function(timeframe = "days"){
  # Average tweet rate per HEI over each HEI's own active posting period.
  # `timeframe` is any unit accepted by difftime(), e.g. "days" or "weeks".
  out_col <- paste0("avg_tweets_per_", timeframe)
  # Span between each HEI's earliest and latest tweet, in the requested unit
  activity_span <- data_tweets %>%
    group_by(id) %>%
    summarise(min_date = min(created_at),
              max_date = max(created_at)) %>%
    mutate(num_days = as.numeric(difftime(max_date, min_date, units = timeframe)))
  # Tweet count divided by the length of the active period
  rate_table <- number_tweets %>%
    left_join(activity_span, by = "id") %>%
    mutate(!!out_col := count / num_days)
  print(rate_table)
  return(rate_table)
}
# Per-HEI averages at two granularities
tweets_per_day <- average_tweets()
tweets_per_week <- average_tweets(timeframe = "weeks")
# Bar chart of average tweets per day; las = 2 rotates HEI labels vertical
barplot(tweets_per_day$avg_tweets_per_days,
names.arg = tweets_per_day$id,
main = "Average Tweets per Day",
xlab = "HEI",
ylab = "Average Number of Tweets",
ylim = c(0, max(tweets_per_day$avg_tweets_per_days) + 1),
las = 2,
col = "#3498DB")
# Adding text labels over each bar and aligning it with the center of each bar
# (barplot(..., plot = FALSE) returns bar midpoints without drawing)
text(x = barplot(tweets_per_day$avg_tweets_per_days, plot = FALSE),
y = tweets_per_day$avg_tweets_per_days,
labels = round(tweets_per_day$avg_tweets_per_days, 2),
pos = 3)
# Same chart for the weekly averages
barplot(tweets_per_week$avg_tweets_per_weeks,
names.arg = tweets_per_week$id,
main = "Average Tweets per Week",
xlab = "HEI",
ylab = "Average Number of Tweets",
ylim = c(0, max(tweets_per_week$avg_tweets_per_weeks) + 5),
las = 2,
col = "#E74C3C")
text(x = barplot(tweets_per_week$avg_tweets_per_weeks, plot = FALSE),
y = tweets_per_week$avg_tweets_per_weeks,
labels = round(tweets_per_week$avg_tweets_per_weeks, 2),
pos = 3)
# Academic-term date windows (start, end) used to classify tweet timestamps.
intervals <- list(
  interval1 = as.POSIXct(c("2022-08-31", "2022-12-15")),
  interval2 = as.POSIXct(c("2023-01-04", "2023-04-01")),
  interval3 = as.POSIXct(c("2023-04-14", "2023-06-15"))
)
# Return TRUE when a single POSIXct timestamp falls inside any of the
# academic intervals (boundaries inclusive), FALSE otherwise.
check_interval <- function(date) {
  # seq_along() instead of 1:length() (safe for empty lists);
  # && instead of & because `date` is a scalar here.
  for (i in seq_along(intervals)) {
    interval_start <- intervals[[i]][1]
    interval_end <- intervals[[i]][2]
    if (date >= interval_start && date <= interval_end) {
      return(TRUE)
    }
  }
  FALSE
}
# Classify every tweet as academic-time (TRUE) or vacation-time (FALSE).
# vapply pins the result to one logical per timestamp (safer than sapply).
data_tweets$academic_year <- vapply(data_tweets$created_at, check_interval, logical(1))
print(data.frame(id = data_tweets$id, academic_year = data_tweets$academic_year))
# Tweet counts per period: FALSE = vacation, TRUE = academic
period_counts <- table(data_tweets$academic_year)
barplot(period_counts,
        main = "Number of Tweets per Timeframe",
        xlab = "Time",
        ylab = "Count",
        ylim = c(0, max(period_counts) + 1000),
        names.arg = c("Vacation", "Academic"),
        col = c("#8E44AD", "#F1C40F"))
# Bug fix: the original passed the raw logical vector to barplot() when
# computing label x-positions, which errors (barplot needs numeric heights);
# reuse the same table() as the drawn plot so labels align with the bars.
text(x = barplot(period_counts, plot = FALSE),
     y = period_counts + 0.5,
     labels = period_counts,
     pos = 3)
analyze_tweets <- function(academic_year_filter = TRUE) {
  # Average tweets per active day for each HEI, restricted to academic
  # time (TRUE, the default) or vacation time (FALSE).
  period_label <- ifelse(academic_year_filter, "academic_time", "vacation_time")
  out_col <- paste0("avg_tweets_in_", period_label)
  # Keep only tweets from the requested period
  period_tweets <- data_tweets %>%
    filter(academic_year == academic_year_filter)
  # Number of distinct calendar days on which each HEI tweeted
  active_days <- period_tweets %>%
    group_by(id) %>%
    summarise(unique_days = n_distinct(as.Date(created_at)))
  # Total tweets per HEI in the period
  period_totals <- period_tweets %>%
    group_by(id) %>%
    summarise(count = n())
  # Average tweets per active day, in a column named after the period
  combined_data <- left_join(active_days, period_totals, by = "id") %>%
    mutate(!!out_col := count / unique_days)
  print(combined_data)
  return(combined_data)
}
# Per-HEI daily tweet averages split by academic / vacation periods
data_tweets_academic <- analyze_tweets()
data_tweets_vacations <- analyze_tweets(academic_year_filter = FALSE)
# Bar chart for academic-time averages
barplot(data_tweets_academic$avg_tweets_in_academic_time,
names.arg = data_tweets_academic$id,
main = "Average Tweets during Academic Time",
xlab = "HEI",
ylab = "Average Number of Tweets",
ylim = c(0, max(data_tweets_academic$avg_tweets_in_academic_time) + 5),
las = 2,
col = "#34495E")
# barplot(..., plot = FALSE) returns bar midpoints for label placement
text(x = barplot(data_tweets_academic$avg_tweets_in_academic_time, plot = FALSE),
y = data_tweets_academic$avg_tweets_in_academic_time,
labels = round(data_tweets_academic$avg_tweets_in_academic_time, 2),
pos = 3)
# Same chart for vacation-time averages
barplot(data_tweets_vacations$avg_tweets_in_vacation_time,
names.arg = data_tweets_vacations$id,
main = "Average Tweets during Vacation Time",
xlab = "HEI",
ylab = "Average Number of Tweets",
ylim = c(0, max(data_tweets_vacations$avg_tweets_in_vacation_time) + 5),
las = 2,
col = "#D35400")
text(x = barplot(data_tweets_vacations$avg_tweets_in_vacation_time, plot = FALSE),
y = data_tweets_vacations$avg_tweets_in_vacation_time,
labels = round(data_tweets_vacations$avg_tweets_in_vacation_time, 2),
pos = 3)
# NOTE(review): this span contains pasted console output (the backticked
# `summarise()` lines); they are not R code and should be removed or
# commented before sourcing this file as a script.
# Creating new table that contains a new column for the day of the week
data_tweets_days <- data_tweets %>%
mutate(day_of_week = weekdays(created_at))
# Selecting only the id, created_at, and day_of_week columns for the new table
data_tweets_days <- data_tweets_days %>%
select(id, created_at, day_of_week)
print(data_tweets_days)
# Grouping by id and day_of_week, then counting the number of tweets
number_tweets_days <- data_tweets_days %>%
group_by(id, day_of_week) %>%
summarise(count = n())
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
# Grouping by id, day_of_week and day created at, then counting the number of tweets
number_tweets_per_day <- data_tweets_days %>%
mutate(created_date = as.Date(created_at)) %>%
group_by(id, day_of_week, created_date) %>%
summarise(count = n())
`summarise()` has grouped output by 'id', 'day_of_week'. You can override using the `.groups` argument.
# Finding for each HEI the average count of tweets per day
# (mean of per-calendar-date counts, per weekday)
average_number_tweets_per_day <- number_tweets_per_day %>%
group_by(id, day_of_week) %>%
summarise(average_count = mean(count))
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(number_tweets_days)
# Finding the HEI with the lowest count of tweets per day
# NOTE(review): slice_min/slice_max keep ties by default, so a weekday can
# contribute more than one low/high row — confirm that is intended.
lowest_count <- number_tweets_days %>%
group_by(day_of_week) %>%
slice_min(order_by = count) %>%
select(day_of_week, id, count)
# Same but highest count of tweets per day
highest_count <- number_tweets_days %>%
group_by(day_of_week) %>%
slice_max(order_by = count) %>%
select(day_of_week, id, count)
# Combine the results
high_low_HEI <- bind_rows(lowest_count, highest_count) %>%
arrange(day_of_week)
print(high_low_HEI)
# Dodged bars of the per-weekday extreme counts, coloured by HEI
ggplot(high_low_HEI, aes(x = day_of_week, y = count, fill = id)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = count),
position = position_dodge(width = 0.9),
vjust = -0.5,
size = 3) +
labs(title = "Lowest and Highest Count of Tweets per Day for Each Day of the Week",
x = "Day of the Week", y = "Count") +
scale_fill_manual(values = rainbow(length(unique(high_low_HEI$id)))) +
theme_minimal() +
theme(legend.title = element_blank())
# Dodged bars of the lowest/highest average daily tweet counts per weekday
ggplot(high_low_average_HEIs, aes(x = day_of_week, y = average_count, fill = id)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(average_count, 2)),
            position = position_dodge(width = 0.7),
            vjust = -0.5,
            size = 3) +
  labs(title = "Highest and Lowest Average Count of Tweets per Day for Each Day of the Week",
       x = "Day of the Week", y = "Average Count") +
  # Bug fix: the palette was sized from high_low_HEI (a different table);
  # size it from the ids actually plotted here so every fill level gets a colour.
  scale_fill_manual(values = rainbow(length(unique(high_low_average_HEIs$id)))) +
  theme_minimal() +
  theme(legend.title = element_blank())
# Table containing views, likes, retweets and replys for each media type for each HEI
# (one row per id x media_type; NA engagement values are dropped from the sums)
types_of_tweets <- data_tweets %>%
group_by(id, media_type) %>%
summarise(count = n(),
total_views = sum(view_count, na.rm = TRUE),
total_likes = sum(favorite_count, na.rm = TRUE),
total_retweets = sum(retweet_count, na.rm = TRUE),
total_replys = sum(reply_count, na.rm = TRUE))
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(types_of_tweets)
# Grouping by HEI and calculating the total values of views, likes and replys across all media types
total_tweets_stats <- types_of_tweets %>%
group_by(id) %>%
summarise(total_views = sum(total_views),
total_likes = sum(total_likes),
total_replys = sum(total_replys))
print(total_tweets_stats)
pie_maker <- function(target_id = "duke.csv"){
  # Draw three pie charts for one HEI: share of views, likes and replies
  # per media type. `target_id` must match an id in types_of_tweets.
  # Filtering data for the specific HEI
  hei_data <- types_of_tweets %>%
    filter(id == target_id)
  # Totals per media type for the specific HEI
  hei_media_views <- hei_data %>%
    group_by(media_type) %>%
    summarise(total_views = sum(total_views),
              total_likes = sum(total_likes),
              total_replys = sum(total_replys))
  # Share (%) of each media type for views, likes and replies
  hei_media_views$percentage_view <- hei_media_views$total_views / sum(hei_media_views$total_views) * 100
  hei_media_views$percentage_like <- hei_media_views$total_likes / sum(hei_media_views$total_likes) * 100
  hei_media_views$percentage_reply <- hei_media_views$total_replys / sum(hei_media_views$total_replys) * 100
  # Creating the pie chart for views
  hei_pie_chart_views <- ggplot(hei_media_views, aes(x = "", y = percentage_view, fill = media_type)) +
    geom_bar(stat = "identity", width = 1) +
    coord_polar("y", start = 0) +
    theme_void() +
    theme(legend.position = "right") +
    geom_text(aes(label = paste(media_type, "\n", total_views, "(", round(percentage_view, 1), "%)")), position = position_stack(vjust = 0.5), color = "#FFFFFF") +
    scale_fill_manual(values = c("no_media" = "#2196F3", "animated_gif" = "#E67E22", "photo" = "#8E44AD", "video" = "#138D75")) +
    labs(title = paste("Views for each media type -", target_id))
  # Creating the pie chart for likes
  # Bug fix: this chart previously plotted the *views* percentages and labels.
  hei_pie_chart_likes <- ggplot(hei_media_views, aes(x = "", y = percentage_like, fill = media_type)) +
    geom_bar(stat = "identity", width = 1) +
    coord_polar("y", start = 0) +
    theme_void() +
    theme(legend.position = "right") +
    geom_text(aes(label = paste(media_type, "\n", total_likes, "(", round(percentage_like, 1), "%)")), position = position_stack(vjust = 0.5), color = "#FFFFFF") +
    scale_fill_manual(values = c("no_media" = "#E91E63", "animated_gif" = "#4A148C", "photo" = "#90CAF9", "video" = "#00BFA5")) +
    labs(title = paste("Likes for each media type -", target_id))
  # Creating the pie chart for replies
  # Bug fix: previously plotted views data and reused the "Likes" title.
  hei_pie_chart_replys <- ggplot(hei_media_views, aes(x = "", y = percentage_reply, fill = media_type)) +
    geom_bar(stat = "identity", width = 1) +
    coord_polar("y", start = 0) +
    theme_void() +
    theme(legend.position = "right") +
    geom_text(aes(label = paste(media_type, "\n", total_replys, "(", round(percentage_reply, 1), "%)")), position = position_stack(vjust = 0.5), color = "#FFFFFF") +
    scale_fill_manual(values = c("no_media" = "#666600", "animated_gif" = "#99CCCC", "photo" = "#9966CC", "video" = "#330000")) +
    labs(title = paste("Replies for each media type -", target_id))
  # Print the pie charts
  print(hei_pie_chart_views)
  print(hei_pie_chart_likes)
  print(hei_pie_chart_replys)
}
# Render the media-type pie charts for every HEI in the dataset
pie_maker()
pie_maker("epfl.csv")
pie_maker("goe.csv")
pie_maker("harvard.csv")
pie_maker("leicester.csv")
pie_maker("manchester.csv")
pie_maker("mit.csv")
pie_maker("sb.csv")
pie_maker("stanford.csv")
pie_maker("trinity.csv")
pie_maker("wv.csv")
pie_maker("yale.csv")
# Calculation of like_ratio and replys_ratio percentages
# (engagement relative to total views, per HEI)
ratios_tweets_table <- total_tweets_stats %>%
mutate(like_ratio = total_likes / total_views * 100,
replys_ratio = total_replys / total_views * 100)
# Creation of new table with each HEI, like_ratio, and replys_ratio
hei_tweets_ratios <- ratios_tweets_table %>%
select(id, like_ratio, replys_ratio) %>%
distinct()
print(hei_tweets_ratios)
# NOTE(review): the two bar layers are drawn at the same x position —
# position_dodge only separates bars within a layer, so the taller ratio
# can hide the shorter one; confirm this overlay is intended.
ggplot(hei_tweets_ratios, aes(x = id)) +
geom_bar(aes(y = like_ratio, fill = "Like Ratio"), stat = "identity", position = "dodge") +
geom_bar(aes(y = replys_ratio, fill = "Replys Ratio"), stat = "identity", position = "dodge") +
geom_text(aes(y = like_ratio, label = round(like_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#000000") +
geom_text(aes(y = replys_ratio, label = round(replys_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#FFFFFF") +
labs(title = "Like and Replys Ratios by HEI",
x = "HEI",
y = "Ratio (%)",
fill = "Metric") +
scale_fill_manual(values = c("Like Ratio" = "#2196F3", "Replys Ratio" = "#F44336")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Table with averages of views, likes, retweets and replys
# NOTE(review): types_of_tweets already has one row per id x media_type,
# so mean() here averages a single value — confirm this is what was meant.
types_of_tweets_per_tweet <- types_of_tweets %>%
group_by(id, media_type) %>%
summarise(avg_views = mean(total_views / count),
avg_likes = mean(total_likes / count),
avg_retweets = mean(total_retweets / count),
avg_replys = mean(total_replys / count))
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(types_of_tweets_per_tweet)
# Grouping by HEI and calculating the average values of views, likes and replys across all media types
total_average_stats <- types_of_tweets_per_tweet %>%
group_by(id) %>%
summarise(avg_views = sum(avg_views),
avg_likes = sum(avg_likes),
avg_replys = sum(avg_replys))
print(total_average_stats)
# Calculation of like_ratio and replys_ratio percentages
ratios_average_table <- total_average_stats %>%
mutate(like_ratio = avg_likes / avg_views * 100,
replys_ratio = avg_replys / avg_views * 100)
# Creation of new table with each HEI, like_ratio, and replys_ratio
hei_average_ratios <- ratios_average_table %>%
select(id, like_ratio, replys_ratio) %>%
distinct()
print(hei_average_ratios)
# Overlaid bar chart of the per-tweet ratios (see layering caveat above
# in ggplot docs: dodging applies within a layer, not across layers)
ggplot(hei_average_ratios, aes(x = id)) +
geom_bar(aes(y = like_ratio, fill = "Like Ratio"), stat = "identity", position = "dodge") +
geom_bar(aes(y = replys_ratio, fill = "Replys Ratio"), stat = "identity", position = "dodge") +
geom_text(aes(y = like_ratio, label = round(like_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#000000") +
geom_text(aes(y = replys_ratio, label = round(replys_ratio, 2)), vjust = -0.5, position = position_dodge(width = 0.9), size = 3, color = "#FFFFFF") +
labs(title = "Like and Replys Ratios by HEI",
x = "HEI",
y = "Ratio (%)",
fill = "Metric") +
scale_fill_manual(values = c("Like Ratio" = "#330066", "Replys Ratio" = "#FF6666")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Create column hour from created_at
# Hour of day (0-23) extracted from each tweet's timestamp
data_tweets_days$created_hour <- as.numeric(format(data_tweets_days$created_at, "%H"))
heatmap_maker <- function(target_id = "duke.csv"){
  # Weekday x hour heatmap of tweet volume for a single HEI.
  # Tweets belonging to the requested HEI only
  hei_tweets <- data_tweets_days %>%
    filter(id == target_id)
  # Tweet volume per (weekday, hour) cell
  cell_counts <- hei_tweets %>%
    group_by(day_of_week, created_hour) %>%
    summarise(num_tweets = n())
  # Darker blue = more tweets in that weekday/hour slot
  ggplot(cell_counts, aes(x = day_of_week, y = created_hour, fill = num_tweets)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(title = paste("Tweet Heatmap for", target_id),
         x = "Day of the week",
         y = "Hour of the day")
}
# Render the weekday/hour heatmap for every HEI
# NOTE(review): the backticked `summarise()` lines below are pasted console
# output, not code; remove them before sourcing this file as a script.
heatmap_maker()
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("epfl.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("goe.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("harvard.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("leicester.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("manchester.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("mit.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("sb.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("stanford.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("trinity.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("wv.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("yale.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
# Word counts per tweet: whitespace-delimited tokens in the tweet text
data_tweets_content <- data_tweets %>%
  select(id, text) %>%
  mutate(num_words = lengths(strsplit(text, "\\s+")))
print(data_tweets_content)
# Average, minimum and maximum tweet length (in words) per HEI
data_tweets_content_metrics <- data_tweets_content %>%
  group_by(id) %>%
  summarise(average_num_words = mean(num_words),
            min_num_words = min(num_words),
            max_num_words = max(num_words))
print(data_tweets_content_metrics)
# Point = average word count, error bar = min/max range, per HEI
ggplot(data_tweets_content_metrics, aes(x = id, y = average_num_words)) +
  geom_point(aes(color = "Average")) +
  geom_errorbar(aes(ymin = min_num_words, ymax = max_num_words, color = "Range"), width = 0.2) +
  scale_color_manual(values = c("Average" = "#1976D2", "Range" = "#EF5350")) +
  labs(title = "Word Count Summary by HEI",
       x = "HEI",
       y = "Number of Words",
       color = "Metric") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Replies-only subset, mirroring the tweet-side analysis above
data_replies <- data[data$type == "Reply", ]
data_replies
# Table containing views, likes, retweets and replys for each media type for each HEI
types_of_replies <- data_replies %>%
group_by(id, media_type) %>%
summarise(count = n(),
total_views = sum(view_count, na.rm = TRUE),
total_likes = sum(favorite_count, na.rm = TRUE),
total_retweets = sum(retweet_count, na.rm = TRUE),
total_replys = sum(reply_count, na.rm = TRUE))
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(types_of_replies)
# Grouping by HEI and calculating the total values of views, likes and replys across all media types
total_replies_stats <- types_of_replies %>%
group_by(id) %>%
summarise(total_views = sum(total_views),
total_likes = sum(total_likes),
total_replys = sum(total_replys))
print(total_replies_stats)
# Calculation of like_ratio and replys_ratio percentages
ratios_replies_table <- total_replies_stats %>%
mutate(like_ratio = total_likes / total_views * 100,
replys_ratio = total_replys / total_views * 100)
# Creation of new table with each HEI, like_ratio, and replys_ratio
hei_replies_ratios <- ratios_replies_table %>%
select(id, like_ratio, replys_ratio) %>%
distinct()
print(hei_replies_ratios)
# Creating table for cluster algorithms
# NOTE(review): `unique_hashtags` is not defined anywhere in this file as
# shown — presumably it is built in an earlier chunk; verify before running.
# Joining attribute count (number of tweets) and unique_hashtags (number of unique hashtags) per HEI
cluster_table <- merge(select(unique_hashtags, id, unique_hashtags), select(number_tweets, id, count), by = "id", all=TRUE)
# Joining attribute avg_tweets_per_days (average of tweets per day) per HEI
cluster_table <- merge(cluster_table, select(tweets_per_day, id, avg_tweets_per_days), by = "id", all=TRUE)
# Joining attribute avg_tweets_per_weeks (average of tweets per week) per HEI
cluster_table <- merge(cluster_table, select(tweets_per_week, id, avg_tweets_per_weeks), by = "id", all=TRUE)
# Joining attribute avg_tweets_in_academic_time (average of tweets during academic time) per HEI
cluster_table <- merge(cluster_table, select(data_tweets_academic, id, avg_tweets_in_academic_time), by = "id", all=TRUE)
# Joining attribute avg_tweets_in_vacation_time (average of tweets during vacation time) per HEI
cluster_table <- merge(cluster_table, select(data_tweets_vacations, id, avg_tweets_in_vacation_time), by = "id", all=TRUE)
# Joining attribute total_views (total number of views), total_likes (total number of likes) and total_replys (total number of replies) per HEI
cluster_table <- merge(cluster_table, select(total_tweets_stats, id, total_views, total_likes, total_replys), by = "id", all=TRUE)
# Joining attribute like_ratio (ratio of total number of likes) and replys_ratio (ratio of total number of replies) per HEI
cluster_table <- merge(cluster_table, select(hei_tweets_ratios, id, like_ratio, replys_ratio), by = "id", all=TRUE)
# Rename immediately so the next ratio join does not collide on like_ratio/replys_ratio
cluster_table <- cluster_table %>%
rename(total_like_ratio = like_ratio,
total_replys_ratio = replys_ratio)
# Joining attribute avg_views (average number of views), avg_likes (average number of likes) and avg_replys (average number of replies) per HEI
cluster_table <- merge(cluster_table, select(total_average_stats, id, avg_views, avg_likes, avg_replys), by = "id", all=TRUE)
# Joining attribute like_ratio (ratio of average number of likes) and replys_ratio (ratio of average number of replies) per HEI
cluster_table <- merge(cluster_table, select(hei_average_ratios, id, like_ratio, replys_ratio), by = "id", all=TRUE)
cluster_table <- cluster_table %>%
rename(avg_like_ratio = like_ratio,
avg_replys_ratio = replys_ratio)
print(cluster_table)
cluster_maker <- function(seed = 123, num_clusters = 3, table){
  # Run k-means on the numeric columns of `table` (one row per HEI) after
  # standardisation; prints and returns the kmeans fit.
  # Bug fix: the seed argument was previously ignored (set.seed(123) was
  # hard-coded), so callers passing seed = 4855 got the 123 initialisation.
  set.seed(seed)
  # Excluding id column for clustering (base subsetting keeps column order)
  cluster_data <- table[, setdiff(names(table), "id"), drop = FALSE]
  # Scaling the data so k-means distances are not dominated by large-scale features
  scaled_data <- scale(cluster_data)
  kmeans_result <- kmeans(scaled_data, centers = num_clusters)
  print(kmeans_result$centers)
  print(kmeans_result$cluster)
  return(kmeans_result)
}
cluster_id_maker <- function(kmeans_result, table){
  # Pair each HEI id with the cluster k-means assigned to it, print the
  # mapping, and draw a quick scatter of the assignment vector.
  membership <- kmeans_result$cluster
  cluster_assignments <- data.frame(id = table$id, cluster = membership)
  print(cluster_assignments)
  # Cluster label by row index — crude visual sanity check
  plot(membership)
}
# NOTE(review): the unprefixed numeric tables below are pasted kmeans console
# output (scaled cluster centers and assignment vectors), not code; remove
# them before sourcing this file as a script.
# k-means with defaults: seed 123, 3 clusters
cluster_123_3 <- cluster_maker(table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.6381349 -0.5376116 -0.5344623 -0.5344623 -0.5093965 -0.5929981 -0.3102091 -0.1331381
2 -0.3232547 1.3319287 1.3324807 1.3324807 1.2897365 1.3561417 1.2841600 1.2437001
3 0.4806948 -0.3971585 -0.3990092 -0.3990092 -0.3901700 -0.3815718 -0.4869754 -0.5552810
total_replys total_like_ratio total_replys_ratio avg_views avg_likes avg_replys avg_like_ratio avg_replys_ratio
1 -0.03547406 1.3168443 1.14394012 -0.04762103 0.5778107 0.5798181 1.4433344 1.09974566
2 1.11229258 -0.2089453 0.07129591 1.21230324 0.7827831 0.6292289 -0.3283523 -0.01971601
3 -0.53840926 -0.5539495 -0.60761801 -0.58234111 -0.6802969 -0.6045235 -0.5574910 -0.54001482
[1] 1 3 3 2 3 3 2 3 1 3 1 2
cluster_id_maker(cluster_123_3, table = cluster_table)
# Same seed, 6 clusters
cluster_123_6 <- cluster_maker(num_clusters = 6, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.4103492 -1.3031254 -1.3032034 -1.3032034 -1.2416625 -1.0231249 -0.6375984 -0.66179186
2 -0.3366538 2.0980363 2.1001655 2.1001655 2.2152351 2.2141794 2.7698115 2.76526795
3 1.3672724 0.7190425 0.7198496 0.7198496 0.7281241 0.7223908 -0.2999826 -0.35327890
4 -0.6314353 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.2090329 0.07640829
5 -0.2808240 -0.5708690 -0.5746071 -0.5746071 -0.5989595 -0.6377948 -0.5055879 -0.58466068
6 -0.6269689 -0.1034839 -0.1067369 -0.1067369 -0.1791121 -0.2061612 0.1645105 0.21131146
total_replys total_like_ratio total_replys_ratio avg_views avg_likes avg_replys avg_like_ratio avg_replys_ratio
1 -0.61859306 0.9061875 -0.73455883 -0.6997429 -0.7372601 -0.68389426 -0.2923006 -0.90543423
2 2.91587745 -0.1894786 0.24320057 2.1085449 1.4654190 1.59191180 -0.3555917 0.18423794
3 -0.31932304 -0.5257982 -0.10316261 -0.5364867 -0.6301142 -0.54350287 -0.5247867 -0.25353926
4 0.55340665 1.7400189 2.73035156 0.4273143 1.9304181 2.31424168 2.6027601 2.94250000
5 -0.55967675 -0.9230136 -0.72436706 -0.5414191 -0.6611809 -0.58166179 -0.5983170 -0.51410929
6 -0.07123056 0.6299026 0.08119858 0.4658671 0.4051027 0.05107825 0.4714812 0.02721398
[1] 6 5 1 2 3 5 6 5 4 3 6 3
cluster_id_maker(cluster_123_6, table = cluster_table)
# NOTE(review): this overwrites cluster_123_3 with a 4-cluster result, so the
# variable name no longer matches its contents; also, as written above,
# cluster_maker hard-codes set.seed(123), so seed = 4855 has no effect.
cluster_123_3 <- cluster_maker(seed = 4855, num_clusters = 4, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.3132053 -0.7539331 -0.7567562 -0.7567562 -0.7596352 -0.7341273 -0.5385905 -0.6039435
2 -0.4672956 1.2357198 1.2354314 1.2354314 1.1911790 1.2741657 1.9924687 1.9375124
3 1.3672724 0.7190425 0.7198496 0.7198496 0.7281241 0.7223908 -0.2999826 -0.3532789
4 -0.6381349 -0.5376116 -0.5344623 -0.5344623 -0.5093965 -0.5929981 -0.3102091 -0.1331381
total_replys total_like_ratio total_replys_ratio avg_views avg_likes avg_replys avg_like_ratio avg_replys_ratio
1 -0.57440583 -0.4657134 -0.7269150 -0.58100004 -0.6802007 -0.6072199 -0.5218129 -0.61194052
2 1.68100730 -0.2551425 -0.1073362 2.03816171 1.4388566 1.1599670 -0.3341956 -0.04542855
3 -0.31932304 -0.5257982 -0.1031626 -0.53648673 -0.6301142 -0.5435029 -0.5247867 -0.25353926
4 -0.03547406 1.3168443 1.1439401 -0.04762103 0.5778107 0.5798181 1.4433344 1.09974566
[1] 4 1 1 2 3 1 2 1 4 3 4 3
cluster_id_maker(cluster_123_3, table = cluster_table)
# 6-cluster run with the alternative seed (same caveat about the seed)
cluster_123_6 <- cluster_maker(seed = 4855, num_clusters = 6, table = cluster_table)
unique_hashtags count avg_tweets_per_days avg_tweets_per_weeks avg_tweets_in_academic_time avg_tweets_in_vacation_time total_views total_likes
1 -0.4103492 -1.3031254 -1.3032034 -1.3032034 -1.2416625 -1.0231249 -0.6375984 -0.66179186
2 -0.3366538 2.0980363 2.1001655 2.1001655 2.2152351 2.2141794 2.7698115 2.76526795
3 1.3672724 0.7190425 0.7198496 0.7198496 0.7281241 0.7223908 -0.2999826 -0.35327890
4 -0.6314353 -0.9289798 -0.9124788 -0.9124788 -0.8237303 -0.8263586 -0.2090329 0.07640829
5 -0.2808240 -0.5708690 -0.5746071 -0.5746071 -0.5989595 -0.6377948 -0.5055879 -0.58466068
6 -0.6269689 -0.1034839 -0.1067369 -0.1067369 -0.1791121 -0.2061612 0.1645105 0.21131146
total_replys total_like_ratio total_replys_ratio avg_views avg_likes avg_replys avg_like_ratio avg_replys_ratio
1 -0.61859306 0.9061875 -0.73455883 -0.6997429 -0.7372601 -0.68389426 -0.2923006 -0.90543423
2 2.91587745 -0.1894786 0.24320057 2.1085449 1.4654190 1.59191180 -0.3555917 0.18423794
3 -0.31932304 -0.5257982 -0.10316261 -0.5364867 -0.6301142 -0.54350287 -0.5247867 -0.25353926
4 0.55340665 1.7400189 2.73035156 0.4273143 1.9304181 2.31424168 2.6027601 2.94250000
5 -0.55967675 -0.9230136 -0.72436706 -0.5414191 -0.6611809 -0.58166179 -0.5983170 -0.51410929
6 -0.07123056 0.6299026 0.08119858 0.4658671 0.4051027 0.05107825 0.4714812 0.02721398
[1] 6 5 1 2 3 5 6 5 4 3 6 3
cluster_id_maker(cluster_123_6, table = cluster_table)